# lxml's etree is used to parse XML and HTML
from lxml import etree

# Sample HTML document used as the parser input below. It contains a <title>,
# a <ul> with five <li> entries (ids "item1".."item5"), and a multi-line <p>.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>aTitle发发发1</title>
</head>
<body>
    <ul>
        <li id="item1">醒醒啦1</li>
        <li id="item2">醒醒啦2</li>
        <li id="item3">醒醒啦3</li>
        <li id="item4">醒醒啦4</li>
        <li id="item5">醒醒啦5</li>
    </ul>
    <p>
        hello
        i
        am
        iron
        man
    </p>
</body>
</html>
"""

# Parse the sample markup; the result is bound to `tree` and inspected below.
tree = etree.HTML(html)

# Show the root element object together with the attributes/methods it exposes.
print(tree, dir(tree))
# Tag name, the value of the "lang" attribute, and the text directly inside
# the root element.
print(tree.tag, tree.attrib['lang'], tree.text)

# Step down one level at a time with find(): root -> <head> -> <title>.
head = tree.find("head")
title = head.find("title")
print(f"{title.tag} {title.text}")

# Same approach for the list: root -> <body> -> <ul>, then collect every <li>
# child of the <ul> with findall().
body = tree.find("body")
ul = body.find("ul")
lis = ul.findall("li")

# One line per list item: its text followed by its "id" attribute.
for li in lis:
    print(f"{li.text} {li.attrib['id']}")


